notebook.community

Edit and run



In [ ]:

    
author: Diogo Silva



In [2]:

    
import numpy as np
import seaborn as snb
import matplotlib.pyplot as plt



In [3]:

    
%matplotlib inline



In [4]:

    
from MyML.cluster.K_Means3 import K_Means
import MyML.helper.partition as partMod



In [5]:

    
n_samples = 1e4
n_samples = int(n_samples)
n_features = 2
n_clusters = 6



In [7]:

    
data = np.random.uniform(size=(n_samples, n_features)).astype(np.float32)
plt.plot(data[:,0],data[:,1],'.')









    Out[7]:





[<matplotlib.lines.Line2D at 0x7fb3b45e9c90>]



In [34]:

    
g1 = np.random.normal(loc=(0,0), scale=1, size=(n_samples/n_clusters, n_features))
g2 = np.random.normal(loc=(10,5), scale=0.5, size=(n_samples/n_clusters, n_features))
g3 = np.random.normal(loc=(0,8), scale=0.1, size=(n_samples/n_clusters, n_features))
g4 = np.random.normal(loc=(-10,5), scale=(0.25,1), size=(n_samples/n_clusters, n_features))
g5 = np.random.normal(loc=(5,10), scale=0.5, size=(n_samples/n_clusters, n_features))
g6 = np.random.normal(loc=(10,-5), scale=0.5, size=(n_samples/n_clusters, n_features))

data = np.vstack((g1,g2,g3,g4,g5,g6))
gt = np.empty(data.shape[0], dtype = np.int32)
for i in range(n_clusters):
    gt[g1.shape[0] * i:g1.shape[0] * i + g1.shape[0]] = i

plt.plot(data[:,0],data[:,1],'.')









    Out[34]:





[<matplotlib.lines.Line2D at 0x7f58e1871a50>]



In [35]:

    
np.unique(gt)









    Out[35]:





array([0, 1, 2, 3, 4, 5], dtype=int32)



In [36]:

    
foldername = "/home/diogoaos/QCThesis/datasets/gaussmix1e4/"
dataname = "data.csv"
gtname = "ground_truth.csv"



In [37]:

    
np.savetxt(foldername + dataname, data, delimiter = ",")
np.savetxt(foldername + gtname, gt, delimiter = ",")



In [38]:

    
data = np.genfromtxt(foldername + dataname, delimiter = ",", dtype = np.float32)



In [8]:

    
n_partitions = 100
ensemble_clusters = [np.sqrt(n_samples)/2, np.sqrt(n_samples)]
ensemble_clusters = map(int, ensemble_clusters)
generator = K_Means(cuda_mem="manual")



In [40]:

    
%time partMod.generateEnsembleToFiles(foldername, data, generator, n_clusters=ensemble_clusters, npartitions=n_partitions)









    



CPU times: user 1.75 s, sys: 154 ms, total: 1.9 s
Wall time: 1.9 s



In [9]:

    
%time ensemble = partMod.generateEnsemble(data, generator, n_clusters=ensemble_clusters, npartitions=n_partitions)









    



CPU times: user 1.7 s, sys: 268 ms, total: 1.97 s
Wall time: 1.97 s

Build co-association matrix

Load ensemble



In [17]:

    
import MyML.cluster.eac as eac



In [42]:

    
reload(eac)









    Out[42]:





<module 'MyML.cluster.eac' from '/home/diogoaos/QCThesis/MyML/cluster/eac.pyc'>



In [43]:

    
foldername = "/home/diogoaos/QCThesis/datasets/gaussmix1e4/"









    



/home/diogoaos/QCThesis/datasets/gaussmix1e4/



In [44]:

    
ensemble = partMod.loadEnsembleFromFiles(foldername=foldername)



In [18]:

    
i=0



In [20]:

    
for clust in ensemble[i]:
    plt.plot(data[clust,0], data[clust, 1], '.')
i+=1
if i >= len(ensemble):
    i=0



In [194]:

    
n_samples = 0
for clust in ensemble[0]:
    n_samples += clust.size
print n_samples



In [67]:

    
fullEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)



In [68]:

    
%time fullEAC.fit(ensemble, assoc_mode="full", prot_mode="none")









    



CPU times: user 3.58 s, sys: 0 ns, total: 3.58 s
Wall time: 3.53 s

Total time = 3.37s



In [69]:

    
full_nnz = fullEAC._coassoc.nonzero()[0].size
print "full matrix edges/vertices ratio : ", full_nnz * 1.0 / n_samples









    



full matrix edges/vertices ratio :  1666.0



In [161]:

    
coassoc = np.zeros((n_samples,n_samples))
%time eac.update_coassoc_with_ensemble(coassoc, ensemble)









    



CPU times: user 1.99 s, sys: 64.3 ms, total: 2.05 s
Wall time: 2.05 s

Total time = 987 ms



In [163]:

    
(coassoc == fullEAC._coassoc).all()









    Out[163]:





True



In [76]:

    
protEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)



In [77]:

    
n_prots = int(0.1 * n_samples)
%time protEAC.fit(ensemble, assoc_mode="full", prot_mode="random", nprot=n_prots)









    



CPU times: user 1.85 s, sys: 7.84 ms, total: 1.86 s
Wall time: 1.86 s



In [78]:

    
protEAC_nnz = protEAC._coassoc.nonzero()[0].size
print "prot matrix edges/vertices ratio : ", full_nnz * 1.0 / n_samples









    



prot matrix edges/vertices ratio :  1666.0

Total time = 421 ms



In [79]:

    
sparseEAC = eac.EAC(n_samples = n_samples, mat_sparse = True)



In [80]:

    
%time sparseEAC.fit(ensemble, assoc_mode="full")









    



CPU times: user 13min 58s, sys: 386 ms, total: 13min 58s
Wall time: 13min 58s

Total time = 13min 21s

Save full coassoc.



In [115]:

    
%time np.savetxt(foldername + "full_coassoc.csv", fullEAC._coassoc, fmt="%d", delimiter=",")









    



CPU times: user 37.4 s, sys: 149 ms, total: 37.5 s
Wall time: 37.5 s

Save full coassocs in CSR format.



In [82]:

    
from scipy.sparse import csr_matrix



In [116]:

    
%time full_sp = csr_matrix(fullEAC._coassoc)
%time prot_sp = csr_matrix(protEAC._coassoc)









    



CPU times: user 1.02 s, sys: 56.5 ms, total: 1.08 s
Wall time: 1.08 s
CPU times: user 770 ms, sys: 0 ns, total: 770 ms
Wall time: 771 ms



In [114]:

    
fullEAC._coassoc[np.diag_indices_from(fullEAC._coassoc)] = 0
protEAC._coassoc[np.diag_indices_from(protEAC._coassoc)] = 0



In [157]:

    
np.where(fullEAC._coassoc[0:1666,0:1666]==100)[0]









    Out[157]:





array([   3,   14,   19,   26,   26,   26,   34,   35,   36,   36,   38,
         39,   43,   44,   44,   49,   61,   67,   71,   71,   76,   79,
         85,   86,   86,   86,   87,   89,   92,   95,   95,  100,  102,
        102,  120,  121,  126,  133,  133,  134,  138,  139,  144,  144,
        144,  144,  147,  147,  148,  148,  148,  161,  163,  177,  178,
        178,  188,  188,  188,  189,  191,  192,  194,  196,  199,  203,
        203,  205,  206,  208,  208,  209,  214,  231,  231,  231,  231,
        234,  234,  237,  238,  238,  240,  246,  246,  247,  248,  248,
        248,  249,  259,  261,  261,  264,  267,  267,  273,  291,  291,
        291,  298,  298,  301,  301,  303,  311,  311,  311,  311,  312,
        316,  317,  320,  326,  326,  327,  332,  332,  332,  337,  337,
        337,  342,  344,  344,  350,  357,  357,  371,  372,  380,  386,
        386,  386,  391,  393,  397,  397,  397,  403,  403,  406,  407,
        410,  416,  425,  425,  430,  430,  436,  440,  448,  448,  450,
        450,  451,  451,  452,  452,  458,  463,  466,  467,  467,  467,
        469,  471,  478,  481,  483,  483,  483,  483,  484,  489,  491,
        491,  495,  495,  495,  495,  498,  503,  509,  510,  511,  514,
        515,  515,  516,  516,  517,  517,  519,  519,  519,  528,  530,
        531,  544,  546,  546,  549,  549,  550,  551,  553,  557,  559,
        560,  560,  562,  562,  562,  562,  567,  574,  580,  581,  591,
        599,  611,  611,  611,  611,  620,  629,  631,  633,  633,  636,
        636,  636,  637,  637,  638,  639,  643,  650,  650,  672,  676,
        676,  677,  677,  683,  683,  684,  685,  685,  685,  689,  689,
        693,  696,  698,  707,  707,  707,  708,  708,  711,  716,  719,
        728,  729,  729,  729,  731,  739,  740,  740,  747,  747,  748,
        750,  752,  756,  756,  765,  771,  771,  771,  771,  783,  784,
        790,  811,  814,  821,  821,  821,  821,  822,  832,  832,  833,
        837,  837,  837,  840,  843,  847,  852,  854,  856,  856,  859,
        861,  866,  866,  866,  867,  872,  873,  875,  877,  877,  889,
        910,  910,  918,  918,  920,  922,  925,  925,  930,  933,  939,
        939,  939,  942,  948,  958,  958,  962,  964,  973,  983,  989,
        993,  994,  996, 1002, 1010, 1020, 1030, 1030, 1030, 1037, 1038,
       1041, 1041, 1047, 1047, 1047, 1050, 1052, 1053, 1053, 1053, 1056,
       1056, 1061, 1061, 1061, 1082, 1084, 1085, 1085, 1090, 1091, 1094,
       1103, 1105, 1105, 1105, 1105, 1115, 1120, 1120, 1130, 1130, 1133,
       1133, 1133, 1133, 1134, 1134, 1136, 1137, 1144, 1152, 1152, 1155,
       1155, 1156, 1156, 1158, 1162, 1162, 1162, 1170, 1174, 1176, 1187,
       1195, 1200, 1201, 1203, 1204, 1206, 1208, 1208, 1210, 1210, 1212,
       1212, 1212, 1216, 1216, 1216, 1219, 1223, 1226, 1228, 1230, 1230,
       1238, 1238, 1238, 1240, 1241, 1242, 1242, 1242, 1242, 1244, 1244,
       1246, 1247, 1249, 1255, 1258, 1264, 1264, 1265, 1266, 1273, 1286,
       1288, 1288, 1297, 1297, 1301, 1301, 1301, 1306, 1306, 1307, 1308,
       1315, 1315, 1323, 1323, 1323, 1326, 1327, 1327, 1327, 1328, 1328,
       1328, 1328, 1335, 1336, 1336, 1336, 1346, 1350, 1350, 1350, 1351,
       1354, 1365, 1369, 1370, 1373, 1374, 1375, 1376, 1378, 1379, 1379,
       1385, 1393, 1393, 1394, 1396, 1396, 1396, 1396, 1401, 1403, 1403,
       1404, 1412, 1417, 1419, 1425, 1427, 1429, 1436, 1437, 1438, 1438,
       1445, 1446, 1446, 1446, 1457, 1457, 1476, 1482, 1484, 1491, 1497,
       1503, 1506, 1507, 1512, 1512, 1520, 1521, 1525, 1527, 1527, 1537,
       1537, 1537, 1538, 1542, 1546, 1547, 1547, 1550, 1550, 1550, 1553,
       1554, 1561, 1564, 1565, 1567, 1590, 1594, 1594, 1601, 1602, 1606,
       1626, 1628, 1633, 1633, 1633, 1633, 1634, 1634, 1653, 1654, 1655,
       1655, 1655, 1660, 1662, 1663, 1663])



In [491]:

    
print full_sp.__str__
print prot_sp.__str__









    



<bound method csr_matrix.__str__ of <9996x9996 sparse matrix of type '<type 'numpy.float32'>'
	with 16254576 stored elements in Compressed Sparse Row format>>
<bound method csr_matrix.__str__ of <9996x9996 sparse matrix of type '<type 'numpy.float32'>'
	with 3084084 stored elements in Compressed Sparse Row format>>



In [117]:

    
np.savetxt(foldername + "full_dest.csr", full_sp.indices, fmt="%d",delimiter=",")
np.savetxt(foldername + "full_weight.csr", full_sp.data, fmt="%d",delimiter=",")
np.savetxt(foldername + "full_fe.csr", full_sp.indptr, fmt="%d",delimiter=",")



In [118]:

    
np.savetxt(foldername + "prot_dest.csr", prot_sp.indices, fmt="%d",delimiter=",")
np.savetxt(foldername + "prot_weight.csr", prot_sp.data, fmt="%d",delimiter=",")
np.savetxt(foldername + "prot_fe.csr", prot_sp.indptr, fmt="%d",delimiter=",")



In [ ]:

    
del full_sp, prot_sp

Evaluate sparsity with co-associasion threshold



In [21]:

    
reload(eac)









    Out[21]:





<module 'MyML.cluster.eac' from '/home/diogoaos/QCThesis/MyML/cluster/eac.pyc'>



In [51]:

    
fullEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
%time fullEAC.fit(ensemble, assoc_mode="full", prot_mode="none")









    



CPU times: user 4.38 s, sys: 35 ms, total: 4.41 s
Wall time: 4.42 s



In [23]:

    
print fullEAC.getMaxAssocs()[0]
print fullEAC.getNNZAssocs()



In [53]:

    
resPD=pd.DataFrame(columns=["col1","col2","col3"])



In [52]:

    
thresholds = np.arange(0,1.01,0.05)
res = np.empty((thresholds.size, 3))
for i in range(thresholds.size):
    res[i, 0] = thresholds[i]
    fullEAC.apply_threshold(thresholds[i])
    max_assocs, max_idx = fullEAC.getMaxAssocs()
    res[i, 1] = max_assocs
    nnz_pc = fullEAC.getNNZAssocs() / 8533572.0
    res[i, 2] = nnz_pc
    print thresholds[i], max_assocs, nnz_pc









    



0.0 1228 0.979769081459
0.05 794 0.613603072664
0.1 597 0.471317520963
0.15 496 0.386988004554
0.2 437 0.325645110863
0.25 367 0.276588279797
0.3 335 0.235490132385
0.35 294 0.200629232401
0.4 262 0.17024875398
0.45 232 0.143375599339
0.5 205 0.119374161254
0.55 167 0.0981199900815
0.6 142 0.0794258254339
0.65 121 0.0627570728881
0.7 104 0.0482775559871
0.75 88 0.0357306412836
0.8 76 0.0249665673413
0.85 68 0.0159323668916
0.9 57 0.00891279759519
0.95 38 0.00390621887294
1.0 7 0.0013216036614



In [25]:

    
import pandas as pd



In [30]:

    
resPD = pd.DataFrame(res, columns=["threshold","max_assocs", "nnz percent relative to max"])



In [33]:

    
print resPD.to_latex()









    



\begin{tabular}{lrrr}
\toprule
{} &  threshold &   max\_assocs &  nnz percent relative to max \\
\midrule
0  &   0.000000 &  1228.000000 &                     0.979769 \\
1  &   0.050000 &   794.000000 &                     0.613603 \\
2  &   0.100000 &   597.000000 &                     0.471318 \\
3  &   0.150000 &   496.000000 &                     0.386988 \\
4  &   0.200000 &   437.000000 &                     0.325645 \\
5  &   0.250000 &   367.000000 &                     0.276588 \\
6  &   0.300000 &   335.000000 &                     0.235490 \\
7  &   0.350000 &   294.000000 &                     0.200629 \\
8  &   0.400000 &   262.000000 &                     0.170249 \\
9  &   0.450000 &   232.000000 &                     0.143376 \\
10 &   0.500000 &   205.000000 &                     0.119374 \\
11 &   0.550000 &   167.000000 &                     0.098120 \\
12 &   0.600000 &   142.000000 &                     0.079426 \\
13 &   0.650000 &   121.000000 &                     0.062757 \\
14 &   0.700000 &   104.000000 &                     0.048278 \\
15 &   0.750000 &    88.000000 &                     0.035731 \\
16 &   0.800000 &    76.000000 &                     0.024967 \\
17 &   0.850000 &    68.000000 &                     0.015932 \\
18 &   0.900000 &    57.000000 &                     0.008913 \\
19 &   0.950000 &    38.000000 &                     0.003906 \\
20 &   0.000038 &     0.000086 &                     0.000046 \\
\bottomrule
\end{tabular}



In [ ]:

    
resPD8



In [35]:

    
fullEAC._coassoc









    Out[35]:





array([[ 100.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,  100.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,  100., ...,    0.,    0.,    0.],
       ..., 
       [   0.,    0.,    0., ...,  100.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,  100.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,  100.]], dtype=float32)



In [16]:

    
fullEAC._coassoc[::100,::100].shape









    Out[16]:





(100, 100)



In [18]:

    
plt.pcolor(fullEAC._coassoc[::50,::50])









    Out[18]:





<matplotlib.collections.PolyCollection at 0x7f2728871290>



In [ ]:

    
plt.pcolor()

SL-HAC



In [91]:

    
from numba import jit

@jit
def outdegree_from_firstedge(firstedge, outdegree, n_edges):
    n_vertices = firstedge.size
    for v in range(n_vertices - 1):
        outdegree[v] = firstedge[v + 1] - firstedge[v]
    outdegree[n_vertices - 1] = n_edges - firstedge[n_vertices - 1]

Get graph



In [6]:

    
print foldername









    



/home/diogoaos/QCThesis/datasets/gaussmix1e4/



In [119]:

    
%time dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
%time weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
%time fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")









    



CPU times: user 44 s, sys: 784 ms, total: 44.8 s
Wall time: 44.7 s
CPU times: user 39.3 s, sys: 682 ms, total: 40 s
Wall time: 39.9 s
CPU times: user 26.3 ms, sys: 26 µs, total: 26.3 ms
Wall time: 26.3 ms



In [86]:

    
dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",")
weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",")
fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",")



In [120]:

    
fe = fe[:-1]



In [121]:

    
od = np.empty_like(fe)
outdegree_from_firstedge(fe, od, dest.size)



In [126]:

    
weight = 100 - weight



In [127]:

    
print "# edges : ", dest.size
print "# vertices : ", fe.size
print "edges/vertices ratio : ", dest.size * 1.0 / fe.size









    



# edges :  16643340
# vertices :  9996
edges/vertices ratio :  1665.0



In [10]:

    
from numba import jit
@jit
def outdegree_from_firstedge(firstedge, outdegree, n_edges):
    n_vertices = firstedge.size
    for v in range(n_vertices - 1):
        outdegree[v] = firstedge[v + 1] - firstedge[v]
    outdegree[n_vertices - 1] = n_edges - firstedge[n_vertices - 1]

Cluster



In [106]:

    
from numba import cuda
import MyML.graph.mst as myMST
import MyML.graph.build as graphBuild
import MyML.graph.connected_components as ccomps
import MyML.cluster.linkage as linkage









    



/home/diogoaos/anaconda/lib/python2.7/site-packages/numba/cuda/decorators.py:106: UserWarning: autojit is deprecated and will be removed in a future release. Use jit instead.
  warn('autojit is deprecated and will be removed in a future release. Use jit instead.')



In [136]:

    
reload(linkage)









    Out[136]:





<module 'MyML.cluster.linkage' from '/home/diogoaos/QCThesis/MyML/cluster/linkage.pyc'>



In [142]:

    
%prun linkage.sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = 100, MAX_TPB=512)



In [140]:

    
%time labels = linkage.sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = 100, MAX_TPB=512)









    



CPU times: user 6.99 s, sys: 2.03 s, total: 9.02 s
Wall time: 9.02 s

Accuracy



In [149]:

    
import MyML.metrics.accuracy as accuracy



In [145]:

    
%time gt = np.genfromtxt(foldername + "ground_truth.csv", dtype = np.int32, delimiter=",")









    



CPU times: user 35.7 ms, sys: 0 ns, total: 35.7 ms
Wall time: 34.6 ms



In [151]:

    
scorer = accuracy.HungarianIndex(nsamples=gt.size)
%time scorer.score(gt, labels)









    



CPU times: user 83.2 ms, sys: 179 µs, total: 83.4 ms
Wall time: 73.7 ms






    



/home/diogoaos/QCThesis/MyML/helper/partition.py:57: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if clusts == None:



In [152]:

    
print scorer.accuracy

1.0